# Replication code for Taylor C. Boas, Dino P. Christenson, and David M. Glick, "Recruiting Large Online Samples in the United States and India: Facebook, Mechanical Turk and Qualtrics," Political Science Research and Methods.

# Analysis conducted in R 3.4.3 on MacOS 10.13.2

# NOTE: This file merges external geographic data into the cleaned and combined India survey data file. It also calculates population weights for the India survey data. Files should be run in the following order; please see readme.txt for details.
# 	1. clean_us_survey.R
# 	2. clean_india_survey.R
# 	3. merge_external_data_us.R
# 	4. merge_external_data_india.R
# 	5. analyze_demographics.R
# 	6. analyze_spaces.R
# 	7. analyze_politics.R
# 	8. analyze_cooperativeness.R
# 	9. analyze_experiments.R

# Set working directory as appropriate
# setwd('~/Dropbox/sample recruitment shared/replication/')

# Clean desktop and load packages. Please make sure all necessary packages are installed.
rm(list=ls(all=T))
library(ggmap)

# Load data files; save variable names
load('india.RData')
varnames.india<-names(india)

##########################################
# Merge India district-level census data
# with pincodes
##########################################

load('census_pop_districts.RData')
districts<-districts[,c('State','District','Level','Name','TRU','Area','Total.Population.Person')]
districts$State<-as.numeric(gsub("'","", districts$State))
statenames<-unique(districts[districts$Level=='STATE',c('State','Name')])
names(statenames)[2]<-'statename'
districts<-districts[districts$Level=='DISTRICT',]
districts<-merge(districts,statenames)

pincodes<-read.csv('all_india_PO_list_without_APS_offices_ver2_lat_long.csv',colClasses='character')
pincodes<-unique(pincodes[,c('pincode','Districtname','statename')])

# Reduce pincode database to those in the survey data (and each unique district) 
india$Q25<-gsub(' *', '', india$Q25) # Remove spaces
pincodes<-pincodes[pincodes$pincode %in% india$Q25,]

# Change state spellings/names in pincode file to match census data
pincodes$statename[pincodes$statename=='TELANGANA']<-'ANDHRA PRADESH'
pincodes$statename[pincodes$statename=='CHATTISGARH']<-'CHHATTISGARH'
pincodes$statename[pincodes$statename=='PONDICHERRY']<-'PUDUCHERRY'
pincodes$statename[pincodes$statename=='DELHI']<-'NCT OF DELHI'

# Kill leading/trailing spaces
districts$Name<-gsub(' *$','', districts$Name)
districts$Name<-gsub('^ *','', districts$Name)

# Exact matches
dist.census<-apply(pincodes[,c('Districtname','statename')],1,function(x) unique(districts$Name[which(districts$statename==x[2])][which(districts$Name[which(districts$statename==x[2])]==x[1])]))
table(sapply(dist.census,length))

# Check and fix non-matches, then rerun
unique(paste(pincodes$Districtname,pincodes$statename)[sapply(dist.census,length) ==0])

pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Karim Nagar ANDHRA PRADESH']<-'Karimnagar'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Mahabub Nagar ANDHRA PRADESH']<-'Mahbubnagar'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='K.V.Rangareddy ANDHRA PRADESH']<-'Rangareddy'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Ananthapur ANDHRA PRADESH']<-'Anantapur'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Cuddapah ANDHRA PRADESH']<-'Y.S.R.'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Nellore ANDHRA PRADESH']<-'Sri Potti Sriramulu Nellore'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Pondicherry PUDUCHERRY']<-'Puducherry'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Marigaon ASSAM']<-'Morigaon'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Sibsagar ASSAM']<-'Sivasagar'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='North Cachar Hills ASSAM']<-'Dima Hasao'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='East Champaran BIHAR']<-'Purba Champaran'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='West Champaran BIHAR']<-'Pashchim Champaran'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Aurangabad(BH) BIHAR']<-'Aurangabad'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Bilaspur(CGH) CHHATTISGARH']<-'Bilaspur'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='East Delhi NCT OF DELHI']<-'East'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='North East Delhi NCT OF DELHI']<-'North East'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='North West Delhi NCT OF DELHI']<-'North West'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='North Delhi NCT OF DELHI']<-'North'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Central Delhi NCT OF DELHI']<-'Central'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='South Delhi NCT OF DELHI']<-'South'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='South West Delhi NCT OF DELHI']<-'South West'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='West Delhi NCT OF DELHI']<-'West'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Ahmedabad GUJARAT']<-'Ahmadabad'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Gandhi Nagar GUJARAT']<-'Gandhinagar'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Yamuna Nagar HARYANA']<-'Yamunanagar'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Hamirpur(HP) HIMACHAL PRADESH']<-'Hamirpur'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Leh JAMMU & KASHMIR']<-'Leh(Ladakh)'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Rajauri JAMMU & KASHMIR']<-'Rajouri'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Ananthnag JAMMU & KASHMIR']<-'Anantnag'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Budgam JAMMU & KASHMIR']<-'Badgam'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Hazaribag JHARKHAND']<-'Hazaribagh'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='East Singhbhum JHARKHAND']<-'Purbi Singhbhum'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Bijapur(KAR) KARNATAKA']<-'Bijapur'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Chickmagalur KARNATAKA']<-'Chikmagalur'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Davangere KARNATAKA']<-'Davanagere'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Chamrajnagar KARNATAKA']<-'Chamarajanagar'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Kasargod KERALA']<-'Kasaragod'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='East Nimar MADHYA PRADESH']<-'Khandwa (East Nimar)'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Raigarh(MH) MAHARASHTRA']<-'Raigarh'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Gondia MAHARASHTRA']<-'Gondiya'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Zunhebotto NAGALAND']<-'Zunheboto'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Baleswar ODISHA']<-'Baleshwar'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Khorda ODISHA']<-'Khordha'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Angul ODISHA']<-'Anugul'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Sundergarh ODISHA']<-'Sundargarh'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Mohali PUNJAB']<-'Sahibzada Ajit Singh Nagar'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Pathankot PUNJAB']<-'Gurdaspur' # Pathankot carved out of Gurdaspur in 2011
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Nawanshahr PUNJAB']<-'Shahid Bhagat Singh Nagar'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Chittorgarh RAJASTHAN']<-'Chittaurgarh'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Jhujhunu RAJASTHAN']<-'Jhunjhunun'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Kanchipuram TAMIL NADU']<-'Kancheepuram'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Tiruvallur TAMIL NADU']<-'Thiruvallur'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Villupuram TAMIL NADU']<-'Viluppuram'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Nilgiris TAMIL NADU']<-'The Nilgiris'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Kanyakumari TAMIL NADU']<-'Kanniyakumari'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Tuticorin TAMIL NADU']<-'Thoothukkudi'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Tiruvarur TAMIL NADU']<-'Thiruvarur'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Hathras UTTAR PRADESH']<-'Mahamaya Nagar'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Maharajganj UTTAR PRADESH']<-'Mahrajganj'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Raebareli UTTAR PRADESH']<-'Rae Bareli'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Haridwar UTTARAKHAND']<-'Hardwar'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='North 24 Parganas WEST BENGAL']<-'North Twenty Four Parganas'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='South 24 Parganas WEST BENGAL']<-'South Twenty Four Parganas'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Malda WEST BENGAL']<-'Maldah'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Bardhaman WEST BENGAL']<-'Barddhaman'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Hooghly WEST BENGAL']<-'Hugli'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='Howrah WEST BENGAL']<-'Haora'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='West Midnapore WEST BENGAL']<-'Paschim Medinipur'
pincodes$Districtname[paste(pincodes$Districtname,pincodes$statename) =='East Sikkim SIKKIM']<-'East District'

dist.census<-apply(pincodes[,c('Districtname','statename')],1,function(x) unique(districts$Name[which(districts$statename==x[2])][which(districts$Name[which(districts$statename==x[2])]==x[1])]))
table(sapply(dist.census,length))

# Merge in census data

names(pincodes)[which(names(pincodes)=='Districtname')]<-'Name'
pincodes<-merge(pincodes,districts[districts$TRU=='Total',c('Name','statename','Area','Total.Population.Person')],all.x=T)

# Aggregate by pincode (sum population and land area for multiple matches), calculate population density

pincodes<-aggregate(list(area=pincodes$Area,pop=pincodes$Total.Population.Person),list(pincode=pincodes$pincode),sum)
pincodes$density<-pincodes$pop/pincodes$area

# Merge back into India survey data

india<-merge(india,pincodes,by.x='Q25',by.y='pincode',all.x=T)

##########################################
# Merge in geocoded lat/long for PIN codes
##########################################

pincodes<-read.csv('all_india_PO_list_without_APS_offices_ver2_lat_long.csv',colClasses='character')

validpins<-unique(india$Q25[india$Q25 %in% pincodes$pincode])

# REPLICATION NOTE: The commented-out lines below query the Google Maps servers to geocode all valid PIN codes in the India survey data. Since we do not control the Google Maps servers, and PIN codes are subject to change, we cannot guarantee that this process is replicable. Hence, the replication file loads the geocodes that we obtained in June 2016 using these commented-out lines of code. Uncommment these two lines to re-generate these data. Note that this takes a bit of time.

#validpins.gc<-geocode(paste(validpins,'India'),output='all')
#save(validpins.gc,file='PINs_geocoded.RData')
load('PINs_geocoded.RData')

validpins.lat<-sapply(validpins.gc,function(x) x$results[[1]]$geometry$location$lat)
validpins.long<-sapply(validpins.gc,function(x) x$results[[1]]$geometry$location$lng)

# Verify geocodes
plot(validpins.long,validpins.lat)

# Fix bad geocodes and re-verify
validpins.lat[which(validpins.lat < -20)]<-NA
validpins.long[which(validpins.long < -50)]<-NA
plot(validpins.long,validpins.lat)

validpins<-data.frame(pin=validpins,lat= validpins.lat,long= validpins.long)
india<-merge(india,validpins,by.x='Q25',by.y='pin',all.x=T)

##########################################
# Calculate weights for each India sample
# based on region (5 categories), age
# (above/below median), and sex 
##########################################

# Samples for calculating weights: completed surveys only

india1<-india[india$Finished==1,]

# Read and format India population data. Restrict to ages 18+

in_pop<-read.csv('india_pop_age_state_sex.csv',skip=7,header=F,as.is=T)
in_pop<-in_pop[,c(2,4,5,7,8)]
names(in_pop)<-c('code','state','age','male','female')
in_pop<-in_pop[in_pop$state!='India',]
in_pop$state<-gsub('State - ','', in_pop$state)
in_pop$state<-gsub(' \\(.*\\)','', in_pop$state)
in_pop$age[in_pop$age=='100+']<-'100'
in_pop$age<-as.numeric(in_pop$age)
in_pop<-in_pop[in_pop$age %in% 18:100,]

# Define region variable. Using officially designated zones: https://en.wikipedia.org/wiki/Administrative_divisions_of_India#Zones

in_pop$region<-NA
in_pop$region[in_pop$state %in% toupper(c('Haryana', 'Himachal Pradesh', 'Jammu & Kashmir', 'Punjab', 'Rajasthan', 'Chandigarh'))]<-'Northern'
in_pop$region[in_pop$state %in% toupper(c('Bihar', 'Madhya Pradesh', 'Uttar Pradesh', 'Uttarakhand', 'NCT of Delhi'))]<-'North-Central'
in_pop$region[in_pop$state %in% toupper(c('Assam', 'Arunachal Pradesh', 'Manipur', 'Meghalaya', 'Mizoram', 'Nagaland', 'Tripura'))]<-'North-Eastern'
in_pop$region[in_pop$state %in% toupper(c('Chhattisgarh', 'Jharkhand', 'Odisha', 'Sikkim', 'West Bengal', 'Andaman & Nicobar Islands'))]<-'Eastern'
in_pop$region[in_pop$state %in% toupper(c('Goa', 'Gujarat', 'Maharashtra', 'Dadra & Nagar Haveli', 'Daman & Diu'))]<-'Western'
in_pop$region[in_pop$state %in% toupper(c('Andhra Pradesh', 'Karnataka', 'Kerala', 'Tamil Nadu', 'Lakshadweep', 'Puducherry'))]<-'Southern'

# Checking regional distribution of population

100*prop.table(tapply(in_pop$male,in_pop$region,sum)+ tapply(in_pop$female,in_pop$region,sum))

# Merging North-Eastern (3.6% of population) with Northern (11.6%)

in_pop$region[in_pop$region=='Northern']<-'Northern-North-Eastern'
in_pop$region[in_pop$region=='North-Eastern']<-'Northern-North-Eastern'
100*prop.table(tapply(in_pop$male,in_pop$region,sum)+ tapply(in_pop$female,in_pop$region,sum))

# Calculating median age

in_pop_nat<-aggregate(list(male=in_pop$male,female=in_pop$female),list(age=in_pop$age),sum)
india_cumpct<-data.frame(age=18:100, cumpct=100*cumsum(in_pop_nat$male+ in_pop_nat$female)/sum(in_pop_nat$male+ in_pop_nat$female))
india_med_age<-india_cumpct$age[which(india_cumpct$cumpct == min(india_cumpct$cumpct[india_cumpct$cumpct >=50]))]

# Checking region, age, and sex distribution of samples to make sure we have at least 1 observation per sample in each cell. Start by defining variables region and old for the sample.

india1$region<-NA
india1$region[india1$Q24 %in% c(13,14,15,28,29,6,4,3,22,23,24,25,33)]<-'Northern-North-Eastern'
india1$region[india1$Q24 %in% c(5,20,35,34,10)]<-'North-Central'
india1$region[india1$Q24 %in% c(7,16,26,30,36,1)]<-'Eastern'
india1$region[india1$Q24 %in% c(11,12,21,8,9)]<-'Western'
india1$region[india1$Q24 %in% c(2,17,18,31,19,27,32)]<-'Southern'

india1$old<-as.numeric(india1$Q3)>=india_med_age

table(india1$old[india1$sample=='Facebook'],india1$Q20[india1$sample=='Facebook'],india1$region[india1$sample=='Facebook'])
table(india1$old[india1$sample=='MTurk'],india1$Q20[india1$sample=='MTurk'],india1$region[india1$sample=='MTurk'])
table(india1$old[india1$sample=='Qualtrics'],india1$Q20[india1$sample=='Qualtrics'],india1$region[india1$sample=='Qualtrics'])

# Calculate weights

in_pop$old<-in_pop$age >= india_med_age
in_pop_agg<-aggregate(list(male=in_pop$male,female=in_pop$female),list(region=in_pop$region,old=in_pop$old),sum)
in_pop_agg_male<-data.frame(region=in_pop_agg$region,old=in_pop_agg$old,male=T,pop=in_pop_agg$male)
in_pop_agg_female<-data.frame(region=in_pop_agg$region,old=in_pop_agg$old,male=F,pop=in_pop_agg$female)
pop_tot<-rbind(in_pop_agg_female, in_pop_agg_male)
pop_tot$prop<-pop_tot$pop/sum(pop_tot$pop)

india1$male<-india1$Q20==1
pop_fb<-with(india1[india1$sample=='Facebook',],aggregate(list(num=region),list(region=region,old=old,male=male),length))
pop_fb$prop<-pop_fb$num/sum(pop_fb$num)
pop_fb$weight<-pop_tot$prop/pop_fb$prop
pop_fb$sample<-'Facebook'
pop_mt<-with(india1[india1$sample=='MTurk',],aggregate(list(num=region),list(region=region,old=old,male=male),length))
pop_mt$prop<-pop_mt$num/sum(pop_mt$num)
pop_mt$weight<-pop_tot$prop/pop_mt$prop
pop_mt$sample<-'MTurk'
pop_qt<-with(india1[india1$sample=='Qualtrics',],aggregate(list(num=region),list(region=region,old=old,male=male),length))
pop_qt$prop<-pop_qt$num/sum(pop_qt$num)
pop_qt$weight<-pop_tot$prop/pop_qt$prop
pop_qt$sample<-'Qualtrics'

weights<-rbind(pop_fb, pop_mt, pop_qt)

# Merge weights into data frame by sample.

india1<-merge(india1,weights[,c('region','old','male','sample','weight')])

# Remove redundant "male" variable and reorder to match india.RData, then save.
india1$male<-NULL
india1<-india1[,c(varnames.india, setdiff(names(india1),varnames.india))]

save(india1, file='india_completions_augmented.RData')